I want to go on vacation and book a self-catering property on Airbnb, but I don't know where to go. So I will compare a few Airbnb datasets about different cities.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import folium
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
I will compare New York City in the USA with Buenos Aires in Argentina.
Each of the two datasets is composed of 16 columns:
# Locations of the raw Airbnb listing exports, one CSV file per city.
NY_CSV_PATH = './new-york-city-airbnb-open-data/AB_NYC_2019.csv'
BA_CSV_PATH = 'Buenos_Aires_AIRBNB.csv'

# Load each city into its own DataFrame.
ny_airbnb = pd.read_csv(NY_CSV_PATH)
ba_airbnb = pd.read_csv(BA_CSV_PATH)

# Peek at the first rows of each dataset.
ny_airbnb.head()
ba_airbnb.head()
As we can see, the price in each dataset is expressed in the local currency of the country.
To make these values comparable, we are going to convert the price column into euros.
We use Google to look up the exchange rates.
# Exchange rates looked up by hand: units of local currency per 1 EUR.
USD_PER_EUR = 1.08
ARS_PER_EUR = 66.69

# Express both price columns in euros so the two cities can be compared.
ny_airbnb['price'] = ny_airbnb['price'] / USD_PER_EUR
ba_airbnb['price'] = ba_airbnb['price'] / ARS_PER_EUR
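Now we want to add another column that groups prices into ranges. Above 100€ the ranges are 50€ wide up to 300€, followed by 300–500€ and a last range for everything above 500€. We do the same for the minimum number of nights.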
def categorize_price(df):
    """Return a new DataFrame: ``df`` plus a categorical ``price_cat`` column.

    Prices are bucketed with ``pd.cut`` into the fixed bands 0-25, 25-50,
    50-100, 100-150, 150-200, 200-250, 250-300 and 300-500. A ninth band,
    "Between 500 and <max>", is added only when the data actually contains
    a price above 500.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``price`` column.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by ``price_cat`` as the last column.
        ``df`` itself is not modified.
    """
    edges = [0, 25, 50, 100, 150, 200, 250, 300, 500]
    labels = [
        '1. Between 0 and 25€',
        '2. Between 25 and 50€',
        '3. Between 50 and 100€',
        '4. Between 100 and 150€',
        '5. Between 150 and 200€',
        '6. Between 200 and 250€',
        '7. Between 250 and 300€',
        '8. Between 300 and 500€',
    ]

    # Series.max() skips NaN, unlike the builtin max().
    top_price = df['price'].max()

    # Only open the last band when a price exceeds 500: pd.cut requires
    # strictly increasing edges and raises ValueError otherwise.
    if pd.notna(top_price) and top_price > edges[-1]:
        edges.append(top_price)
        labels.append('9. Between 500 and ' + str(int(top_price)) + '€')

    price_cat = pd.cut(df['price'], bins=edges, include_lowest=True, labels=labels)

    # Give the new column its own name instead of a second 'price' column.
    return pd.concat([df, price_cat.rename('price_cat')], axis=1)
def categorize_nights(df):
    """Return a new DataFrame: ``df`` plus a categorical ``minimum_nights_cat``.

    ``minimum_nights`` is bucketed with ``pd.cut`` into right-closed bands:
    exactly 1 night, 2-7, 8-14, 15-21, 22-30, 31-60, 61-90 and 91-365
    nights (labels keep the original "Between a and b nights" wording).
    A last band, "Between 365 and <max> nights", is added only when the data
    contains a value above 365. Values of 0 or less fall outside every band
    and become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``minimum_nights`` column.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by ``minimum_nights_cat`` as the last
        column. ``df`` itself is not modified.
    """
    # Edges start at 0 so that the first band (0, 1] holds exactly one night;
    # starting at [1, 2] would label two-night stays as '1 night'.
    edges = [0, 1, 7, 14, 21, 30, 60, 90, 365]
    labels = [
        '1 night',
        'Between 2 and 7 nights',
        'Between 7 and 14 nights',
        'Between 14 and 21 nights',
        'Between 21 and 30 nights',
        'Between 30 and 60 nights',
        'Between 60 and 90 nights',
        'Between 90 and 365 nights',
    ]

    # Series.max() skips NaN, unlike the builtin max().
    longest_stay = df['minimum_nights'].max()

    # Only open the last band when a value exceeds 365: pd.cut requires
    # strictly increasing edges and raises ValueError otherwise.
    if pd.notna(longest_stay) and longest_stay > edges[-1]:
        edges.append(longest_stay)
        labels.append('Between 365 and ' + str(int(longest_stay)) + ' nights')

    nights_cat = pd.cut(df['minimum_nights'], bins=edges, labels=labels)

    # Give the new column its own name instead of a second 'minimum_nights'.
    return pd.concat([df, nights_cat.rename('minimum_nights_cat')], axis=1)
# Append the price category, then the minimum-nights category, to each city.
ba_airbnb = categorize_nights(categorize_price(ba_airbnb))
ny_airbnb = categorize_nights(categorize_price(ny_airbnb))

# Name every column explicitly (assignment is positional). The last two
# entries are the category columns appended by the two helpers above.
ny_airbnb.columns = [
    'id', 'name', 'host_id',
    'host_name', 'neighbourhood_group', 'neighbourhood',
    'latitude', 'longitude', 'room_type',
    'price', 'minimum_nights', 'number_of_reviews',
    'last_review', 'reviews_per_month', 'calculated_host_listings_count',
    'availability_365', 'price_cat', 'minimum_nights_cat',
]
# Same layout as New York, minus 'neighbourhood_group'.
ba_airbnb.columns = [
    'id', 'name', 'host_id',
    'host_name', 'neighbourhood',
    'latitude', 'longitude', 'room_type',
    'price', 'minimum_nights', 'number_of_reviews',
    'last_review', 'reviews_per_month', 'calculated_host_listings_count',
    'availability_365', 'price_cat', 'minimum_nights_cat',
]

# sort_values() returns a new DataFrame; assign it back, otherwise the
# sort has no effect on the datasets used below.
ny_airbnb = ny_airbnb.sort_values('price_cat', ascending=True)
ba_airbnb = ba_airbnb.sort_values('price_cat', ascending=True)

# Peek at the first rows of each prepared dataset.
ny_airbnb.head()
ba_airbnb.head()
Now, we'll start our analysis.
First, we compare the size of the two datasets and look at the mean and the median price in each city.
# Dataset sizes (non-null ids).
ny_rows = ny_airbnb['id'].count()
ba_rows = ba_airbnb['id'].count()
print(f'Number of values in NY: {ny_rows}')
print(f'Number of values in BA: {ba_rows}')

# Mean nightly price per city, rounded to whole euros.
ba_mean = round(ba_airbnb['price'].mean())
ny_mean = round(ny_airbnb['price'].mean())
print(f'\nMean price in Buenos Aires : {ba_mean} €\nMean price in New York : {ny_mean} €')

# Median nightly price per city, rounded to whole euros.
ba_median = round(ba_airbnb['price'].median())
ny_median = round(ny_airbnb['price'].median())
print(f'\nMedian price in Buenos Aires : {ba_median} €\nMedian price in New York : {ny_median} €')
Now, we will look at the geographical distribution of Airbnb listings in the two cities.
from folium.plugins import HeatMap

# Colour ramp used by both heat maps (a fresh copy is passed to each layer).
heat_gradient = {0.2: 'blue', 0.4: 'purple', 0.6: 'orange', 1.0: 'red'}

# --- New York: heat map of listing coordinates ---
ny_points = ny_airbnb[['latitude', 'longitude']].dropna()
ny_map = folium.Map([40.7128, -74.0060], zoom_start=10)
HeatMap(ny_points, radius=8, gradient=dict(heat_gradient)).add_to(ny_map)
display(ny_map)

# --- New York: listings coloured by neighbourhood group ---
_, ny_axes = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=ny_airbnb,
    x='longitude',
    y='latitude',
    hue='neighbourhood_group',
    s=20,
    ax=ny_axes,
)

# --- Buenos Aires: heat map of listing coordinates ---
ba_points = ba_airbnb[['latitude', 'longitude']].dropna()
ba_map = folium.Map([-34.5899, -58.416363504830635], zoom_start=11)
HeatMap(ba_points, radius=8, gradient=dict(heat_gradient)).add_to(ba_map)
display(ba_map)
The Buenos Aires dataset doesn't have the neighbourhood_group column, so we have to work with a large number of individual neighbourhoods.
For the visualisation, we will remove all neighbourhoods that have fewer than 1000 Airbnb listings.
# Listings per neighbourhood before any filtering.
hood_sizes = ba_airbnb['neighbourhood'].value_counts()
print(
    'Before removing Neighbourhood under 1000 values :',
    len(ba_airbnb['neighbourhood'].unique()),
    ' neighbourhoods\n',
    hood_sizes,
)

# Keep only the rows whose neighbourhood has more than 1000 listings.
large_hoods = hood_sizes[hood_sizes > 1000].index
keep_row = ba_airbnb['neighbourhood'].isin(large_hoods)
ba_airbnb = ba_airbnb[keep_row]

# Same report on the reduced dataset.
print(
    'After removing Neighbourhood under 1000 values :',
    len(ba_airbnb['neighbourhood'].unique()),
    'neighbourhoods\n',
    ba_airbnb['neighbourhood'].value_counts(),
    '\nNumber of values in the dataset:',
    ba_airbnb['neighbourhood'].count(),
)

# Remaining listings coloured by neighbourhood.
_, ba_axes = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=ba_airbnb,
    x='longitude',
    y='latitude',
    hue='neighbourhood',
    s=20,
    ax=ba_axes,
)
# Two side-by-side donut charts: share of listings per neighbourhood
# (Buenos Aires) and per neighbourhood group (New York).
fig = make_subplots(
    rows=1, cols=2,
    shared_xaxes=True,
    vertical_spacing=0.03,
    specs=[[{'type': 'domain'},
            {'type': 'domain'}]]
)

# (trace name, dataset, column holding the area name), in subplot order.
pie_inputs = [
    ("Buenos Aires", ba_airbnb, 'neighbourhood'),
    ("New York", ny_airbnb, 'neighbourhood_group'),
]
for col_number, (city, frame, area_column) in enumerate(pie_inputs, start=1):
    # Percentage of listings in each area.
    share = frame.groupby(area_column).size() / frame[area_column].count() * 100
    fig.add_trace(
        go.Pie(labels=share.index, values=share.values, hole=.5, name=city),
        row=1, col=col_number,
    )

fig.update_layout(
    # The chart shows neighbourhoods, not room types.
    title_text="Repartition of neighbourhoods in New York and Buenos Aires",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Buenos Aires', x=0.13, y=0.5, font_size=20, showarrow=False),
                 dict(text='New York', x=0.84, y=0.5, font_size=20, showarrow=False)])
fig.show()
So it seems that in NY I will effectively only have the choice between two neighbourhoods: Manhattan and Brooklyn.
In BA, I have more choices, but if I want to be close to the sea, the geolocation shows that Palermo would be the best neighbourhood.
Next, we'll look at the price per neighbourhood to help us make our decision.
# Two side-by-side donut charts: share of listings per price category.
fig = make_subplots(
    rows=1, cols=2,
    shared_xaxes=True,
    vertical_spacing=0.03,
    specs=[[{'type': 'domain'},
            {'type': 'domain'}]]
)

# One donut per city, in subplot order; sort=False keeps category order.
for col_number, (city, frame) in enumerate(
        [("Buenos Aires", ba_airbnb), ("New York", ny_airbnb)], start=1):
    # Percentage of listings in each price category.
    share = frame.groupby('price_cat').size() / frame['price_cat'].count() * 100
    donut = go.Pie(labels=share.index, values=share.values,
                   hole=.5, name=city, sort=False)
    fig.add_trace(donut, row=1, col=col_number)

# City names written in the center of the donut pies.
center_labels = [
    dict(text='Buenos Aires', x=0.13, y=0.5, font_size=20, showarrow=False),
    dict(text='New York', x=0.84, y=0.5, font_size=20, showarrow=False),
]
fig.update_layout(
    title_text="Repartition of price in New York and Buenos Aires",
    annotations=center_labels,
)
fig.show()
# Box plots of nightly price per area, restricted to listings under 250
# so that a few very expensive listings do not flatten the boxes.

# Buenos Aires, one box per neighbourhood.
ng = ba_airbnb[ba_airbnb['price'] < 250]
fig = px.box(ng, x="neighbourhood", y="price", color="neighbourhood")
# City name spelled "Buenos Aires", consistent with the other figures.
fig.update_layout(title_text="Distribution of Neighbourhood price under 250 in Buenos Aires")
fig.show()

# New York, one box per neighbourhood group.
ng = ny_airbnb[ny_airbnb['price'] < 250]
fig = px.box(ng, x="neighbourhood_group", y="price", color="neighbourhood_group")
fig.update_layout(title_text="Distribution of Neighbourhood price under 250 in New York")
fig.show()
As we could expect, New York is far more expensive than Buenos Aires. But let's face it, with the current situation (COVID-19) it's a bit difficult to go anywhere.
Still, let's not get demoralized and keep looking for our perfect destination.
So now we'll look at what kind of rental we can book and how many nights we can spend in our Airbnb.
# Two side-by-side donut charts: share of listings per minimum-nights band.
fig = make_subplots(
    rows=1, cols=2,
    shared_xaxes=True,
    vertical_spacing=0.03,
    specs=[[{'type': 'domain'},
            {'type': 'domain'}]]
)

# One donut per city, in subplot order; sort=False keeps category order.
city_frames = {"Buenos Aires": ba_airbnb, "New York": ny_airbnb}
for col_number, (city, frame) in enumerate(city_frames.items(), start=1):
    # Percentage of listings in each minimum-nights category.
    per_band = frame.groupby('minimum_nights_cat').size()
    share = per_band / frame['minimum_nights_cat'].count() * 100
    donut = go.Pie(labels=share.index, values=share.values,
                   hole=.5, name=city, sort=False)
    fig.add_trace(donut, row=1, col=col_number)

# City names written in the center of the donut pies.
center_labels = [
    dict(text='Buenos Aires', x=0.13, y=0.5, font_size=20, showarrow=False),
    dict(text='New York', x=0.84, y=0.5, font_size=20, showarrow=False),
]
fig.update_layout(
    title_text="Repartition of minimum nights in New York and Buenos Aires",
    annotations=center_labels,
)
fig.show()
# Two side-by-side donut charts: share of listings per room type.
fig = make_subplots(
    rows=1, cols=2,
    shared_xaxes=True,
    vertical_spacing=0.03,
    specs=[[{'type': 'domain'},
            {'type': 'domain'}]]
)

# One donut per city, in subplot order.
for col_number, (city, frame) in enumerate(
        (("Buenos Aires", ba_airbnb), ("New York", ny_airbnb)), start=1):
    # Percentage of listings of each room type.
    share = frame.groupby('room_type').size() / frame['room_type'].count() * 100
    donut = go.Pie(labels=share.index, values=share.values, hole=.5, name=city)
    fig.add_trace(donut, row=1, col=col_number)

# City names written in the center of the donut pies.
center_labels = [
    dict(text='Buenos Aires', x=0.13, y=0.5, font_size=20, showarrow=False),
    dict(text='New York', x=0.84, y=0.5, font_size=20, showarrow=False),
]
fig.update_layout(
    title_text="Repartition of room type in New York and Buenos Aires",
    annotations=center_labels,
)
fig.show()
# Number of listings per room type, split by area, one chart per city.
# Each entry: (city name for the title, dataset, column used for the hue).
countplot_inputs = (
    ("Buenos Aires", ba_airbnb, "neighbourhood"),
    ("New York", ny_airbnb, "neighbourhood_group"),
)
for city, frame, area_column in countplot_inputs:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=frame, x='room_type', hue=area_column)
    plt.title("Room types occupied range by neighbourhood in " + city)
    plt.show()
# room_type - price, New York.
# Bars are ordered by ascending median price of each room type.
ny_order = ny_airbnb.groupby('room_type')['price'].median().sort_values().index.tolist()
sns.barplot(x='room_type', y="price", data=ny_airbnb, order=ny_order)
plt.title("Room types occupied range by price in New York")
plt.show()

# room_type - price, Buenos Aires.
# Plot the Buenos Aires data here: both the ordering and the bars must come
# from ba_airbnb for the chart to match its title.
ba_order = ba_airbnb.groupby('room_type')['price'].median().sort_values().index.tolist()
sns.barplot(x='room_type', y="price", data=ba_airbnb, order=ba_order)
plt.title("Room types occupied range by price in Buenos Aires")
plt.show()
As we can see, the majority of listings are of type Entire home/apt, and we can also choose a Private room.
The price varies as well. An entire home is far more expensive than a private room, since you get more space, so that is quite normal.
Now it just depends on what we want for our vacation: whether we want to spend our week with a local or just with some friends.
Let's see how prices are distributed geographically.
# Map of every listing, coloured by price category, one map per city.

# Take the legend order from the categories stored in the column itself.
# Rebuilding the labels from the current maximum price can produce a last
# label that differs from the real category once rows have been filtered
# out after the categorisation.
labels_ba = list(ba_airbnb['price_cat'].cat.categories)
fig = px.scatter_mapbox(ba_airbnb, lat=ba_airbnb['latitude'],
                        lon=ba_airbnb['longitude'], hover_name="price_cat",
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        color='price_cat',
                        zoom=11, height=300, size_max=15,
                        category_orders={'price_cat': labels_ba})
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

# Same map for New York.
labels_ny = list(ny_airbnb['price_cat'].cat.categories)
fig = px.scatter_mapbox(ny_airbnb, lat=ny_airbnb['latitude'],
                        lon=ny_airbnb['longitude'], hover_name="price_cat",
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        color='price_cat', zoom=8, height=300,
                        category_orders={'price_cat': labels_ny})
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
These maps are not very readable, so I will try to make them clearer with a GIF that shows one price category per frame.
import os

# One single-colour sequence per price category, in category order.
list_of_color = [['blue'],
                 ['red'],
                 ['green'],
                 ['purple'],
                 ['orange'],
                 ['royalblue'],
                 ['maroon'],
                 ['darkgreen'],
                 ['magenta'],
                 ]


def _export_price_frames(df, out_dir, zoom):
    """Write one PNG map per price category of ``df`` into ``out_dir``.

    Each file is named ``<category label>.png`` and shows only the listings
    of that category, drawn in the colour at the category's position in
    ``list_of_color``.
    """
    # write_image does not create missing folders.
    os.makedirs(out_dir, exist_ok=True)

    # Drop NaN before sorting: a NaN cannot be ordered against the labels.
    categories = sorted(df['price_cat'].dropna().unique())

    for index, val in enumerate(categories):
        temp_df = df[df['price_cat'] == val]
        fig = px.scatter_mapbox(temp_df,
                                lat=temp_df['latitude'],
                                lon=temp_df['longitude'],
                                hover_name="price_cat",
                                # Wrap around rather than fail if there are
                                # ever more categories than colours.
                                color_discrete_sequence=list_of_color[index % len(list_of_color)],
                                color='price_cat',
                                zoom=zoom,
                                height=300,
                                size_max=15)
        fig.update_layout(mapbox_style="open-street-map")
        fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
        fig.write_image(os.path.join(out_dir, val + ".png"))


# Frames for New York and for Buenos Aires.
_export_price_frames(ny_airbnb, "./price_ny", zoom=9)
_export_price_frames(ba_airbnb, "./price_ba", zoom=10)
With these frames saved as images, I create a GIF by running the `convert` command (ImageMagick) directly in the terminal.
For New York :
> cd price_ny/
> convert -delay 100 -loop 0 "1. Between 0 and 25€.png" "2. Between 25 and 50€.png" "3. Between 50 and 100€.png" "4. Between 100 and 150€.png" "5. Between 150 and 200€.png" "6. Between 200 and 250€.png" "7. Between 250 and 300€.png" "8. Between 300 and 500€.png" "9. Between 500 and 9259€.png" cat_ny.gif
For Buenos Aires :
> cd price_ba/
> convert -delay 100 -loop 0 "1. Between 0 and 25€.png" "2. Between 25 and 50€.png" "3. Between 50 and 100€.png" "4. Between 100 and 150€.png" "5. Between 150 and 200€.png" "6. Between 200 and 250€.png" "7. Between 250 and 300€.png" "8. Between 300 and 500€.png" "9. Between 500 and 9259€.png" cat_ba.gif
Now, we just want to see what could be the most used words in those two datasets.


Now, we want to see which words are used most often in the listing names of the two datasets.
Maybe this will give us some ideas of places to visit. Who knows?
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS


def _show_name_wordcloud(names):
    """Display a word cloud of the most frequent words in ``names``.

    ``names`` is a pandas Series of listing names. Missing names are
    skipped; English stop words (``STOPWORDS``) are excluded and at most
    200 words are drawn.
    """
    # Drop missing names first: str(NaN) is "nan", which would otherwise be
    # counted as a word.
    text = " ".join(names.dropna().astype(str))

    # Create and generate a word cloud image.
    wordcloud = WordCloud(max_words=200,
                          background_color="black",
                          max_font_size=92,
                          contour_width=3,
                          contour_color='steelblue',
                          stopwords=set(STOPWORDS))
    wordcloud.generate(text)

    # Display the generated image without axes.
    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


# One word cloud per city.
_show_name_wordcloud(ny_airbnb['name'])
_show_name_wordcloud(ba_airbnb['name'])